What topics are discussed in Trustpilot reviews of UK train companies?
What could UK train companies learn from these reviews to improve their services?
import os
import re
import sys
import heapq
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from matplotlib.ticker import FuncFormatter
import matplotlib.colors as mcolors
from collections import Counter
from PIL import Image
# Custom functions and constants
# Put the current working directory on the import path so that the
# project-local modules imported right after this can be found.
cwd = os.getcwd()
sys.path.append(cwd)
import constants as cst
import visualizations as viz
import data_cleaning
import modelization
/opt/anaconda3/envs/okra_env/lib/python3.6/site-packages/scipy/sparse/sparsetools.py:21: DeprecationWarning: `scipy.sparse.sparsetools` is deprecated! scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
# Read the first review dataset (JSON) and pass it straight through the
# project's cleaning pipeline; the trailing expression displays the result.
df_reviews = data_cleaning.full_cleaning(pd.read_json(cst.TRAIN_REVIEWS1))
df_reviews
| date | url | stars | review | |
|---|---|---|---|---|
| 4 | 2011-05-28 15:00:36+00:00 | nationalrail | star-rating star-rating-4 star-rating--medium | [check, enquiry, planning, journey, allows, qu... |
| 1950 | 2011-11-20 20:43:09+00:00 | hulltrains | star-rating star-rating-4 star-rating--medium | [reasonable, price, first, class, spacious, wi... |
| 931 | 2015-01-07 23:32:34+00:00 | virgintrains | star-rating star-rating-5 star-rating--medium | [fantastic, amaze] |
| 3 | 2015-01-13 12:26:52+00:00 | nationalrail | star-rating star-rating-4 star-rating--medium | [save, single, specific, inexplicably, cheaper... |
| 930 | 2015-02-11 13:20:32+00:00 | virgintrains | star-rating star-rating-5 star-rating--medium | [coventry, wolverhampton, station, miss, helpf... |
| ... | ... | ... | ... | ... |
| 1151 | 2018-07-17 08:54:14+00:00 | arrivatrainswales | star-rating star-rating-1 star-rating--medium | [terrible, terrible] |
| 1880 | 2018-07-17 09:32:06+00:00 | gwr | star-rating star-rating-1 star-rating--medium | [star, appropriate, star, appropriate] |
| 1740 | 2018-07-17 09:32:06+00:00 | gwr | star-rating star-rating-1 star-rating--medium | [star, appropriate, star, appropriate] |
| 1730 | 2018-07-17 13:59:52+00:00 | gwr | star-rating star-rating-1 star-rating--medium | [awful, previous, commentator, star, appropria... |
| 1879 | 2018-07-17 13:59:52+00:00 | gwr | star-rating star-rating-1 star-rating--medium | [awful, previous, commentator, star, appropria... |
2021 rows × 4 columns
# Plot the number of reviews per month for the first dataset.
viz.plot_occ_over_time(df_reviews, "Number of Reviews per month")
We don't have recent reviews, but we need them to analyse the topics that are currently relevant to customers and to answer the business question. Let's scrape all the reviews of each UK train company on Trustpilot.
# Load the scraped reviews from a tab-separated file. The raw frame is kept
# in its own variable because it is used again later in the notebook.
df_scraped_reviews = pd.read_csv(cst.TRAIN_REVIEWS2, sep='\t')
# Apply the same cleaning pipeline as for the first dataset.
df_prepared_reviews = data_cleaning.full_cleaning(df_scraped_reviews)
# Display the cleaned frame.
df_prepared_reviews
| date | url | stars | review | |
|---|---|---|---|---|
| 374 | 2011-07-11 12:33:11+00:00 | eastmidlandstrains | 5 | [fantastic, lincoln, various, raileast, coast,... |
| 1144 | 2014-11-01 20:06:34+00:00 | virgintrains | 1 | [first, class, recently, birthday, party, cele... |
| 1143 | 2014-11-01 20:06:34+00:00 | virgintrains | 4 | [generally, nothing, wrong, book, excellent, p... |
| 1142 | 2014-11-01 20:06:34+00:00 | virgintrains | 4 | [fairly, recently, first, class, longer, busin... |
| 1141 | 2014-11-01 20:06:34+00:00 | virgintrains | 1 | [absolutely, awful, standard, class, book, adv... |
| ... | ... | ... | ... | ... |
| 2024 | 2020-11-12 14:44:50+00:00 | eurostar | 1 | [loyal, stay, time, generally, happy, expect, ... |
| 2023 | 2020-11-12 14:44:50+00:00 | eurostar | 1 | [eurotrash, book, message, cancel, issue, vouc... |
| 2022 | 2020-11-12 14:44:50+00:00 | eurostar | 1 | [purchase, purchase, grandchild, return, easte... |
| 2020 | 2020-11-12 14:44:50+00:00 | eurostar | 1 | [horrible, stressful, journey, horrible, staff... |
| 2035 | 2020-11-12 14:44:50+00:00 | eurostar | 1 | [bad, experience, bad, experience, yesterday, ... |
4712 rows × 4 columns
# Plot the number of reviews per month for the cleaned scraped dataset.
viz.plot_occ_over_time(df_prepared_reviews, "Number of Reviews per month")
We now have more than twice as much data (4712 reviews instead of 2021), including recent reviews, so let's start the analysis.
# Plot the number of reviews per train company.
# NOTE(review): the frame displayed above lists the columns date/url/stars/review;
# confirm where "trainCompany" comes from (possibly derived inside plot_occ).
viz.plot_occ(df_prepared_reviews, "trainCompany", "Number of reviews per train company")
We can observe that some companies get a lot more reviews than others.
# Plot how often each star rating occurs. `c='stars'` is forwarded to plot_occ
# (presumably the colour dimension -- verify in visualizations.plot_occ).
# NOTE(review): the title string is spelled "Occurence" (should read "Occurrence").
viz.plot_occ(df_prepared_reviews, "stars", "Occurence of ratings", c='stars')
We can see that most reviews are bad reviews. We can also see that people are most likely to give an extreme rating.
# Plot the ratings broken down by train company.
viz.plot_ratings_per_company(df_prepared_reviews)
Every company has mostly bad reviews. However, some companies, such as gwr, are slightly better reviewed.
%%time
# Build the modelling inputs from all cleaned reviews. Judging by the names the
# result is unpacked into, these are the review texts, an id->word dictionary
# and the corpus -- verify in modelization.create_dict_corpus.
clean_reviews, id2word_1, corpus_1 = modelization.create_dict_corpus(df_prepared_reviews)
# Build LDA model with best coherence
# (the selection itself happens inside modelization.viz_best_coherence).
ldamodel = modelization.viz_best_coherence(id2word_1, corpus=corpus_1, texts=clean_reviews)
CPU times: user 2min 45s, sys: 31.9 s, total: 3min 16s Wall time: 4min 29s
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
import pyLDAvis.gensim
from gensim.models import CoherenceModel
from gensim import matutils
# Interactive pyLDAvis view of the topics of the LDA model trained on all reviews.
pyLDAvis.enable_notebook()
# sort_topics=False is passed so that pyLDAvis keeps the model's own topic order.
LDAvis_prepared = pyLDAvis.gensim.prepare(ldamodel, corpus=corpus_1, dictionary=id2word_1, sort_topics=False)
# Save the visualization to html. Create the output folder first so the export
# does not fail when 'viz/' is missing, e.g. on a fresh checkout (os is
# imported at the top of the notebook).
os.makedirs('viz', exist_ok=True)
pyLDAvis.save_html(LDAvis_prepared, 'viz/general_lda_viz.html')
# Static summaries from the project's visualizations module (imported as `viz`):
# topic word clouds, keyword counts/importance, and the most discussed topics.
viz.wordcloud_topics_viz(ldamodel)
viz.viz_wordcount_importance_keywords(ldamodel, clean_reviews)
viz.viz_most_discussed_topics(ldamodel, corpus_1)
To obtain more informative topics, we separate positive reviews from negative reviews.
# Keep only the positive reviews, i.e. those rated 4 or 5 stars.
df_pos_reviews = df_prepared_reviews[df_prepared_reviews['stars'].isin([4, 5])]
%%time
# Same modelling inputs as for the full dataset, restricted to positive reviews.
reviews_pos, id2word_pos, corpus_pos = modelization.create_dict_corpus(df_pos_reviews)
# Build LDA model
ldamodel_pos = modelization.viz_best_coherence(id2word_pos, corpus=corpus_pos, texts=reviews_pos)
CPU times: user 1min 12s, sys: 7.04 s, total: 1min 19s Wall time: 1min 36s
# Interactive pyLDAvis view of the topics of the LDA model trained on positive reviews.
pyLDAvis.enable_notebook()
# sort_topics=False is passed so that pyLDAvis keeps the model's own topic order.
LDAvis_prepared = pyLDAvis.gensim.prepare(ldamodel_pos, corpus=corpus_pos, dictionary=id2word_pos, sort_topics=False)
# Save the visualization to html. Create the output folder first so the export
# does not fail when 'viz/' is missing, e.g. on a fresh checkout (os is
# imported at the top of the notebook).
os.makedirs('viz', exist_ok=True)
pyLDAvis.save_html(LDAvis_prepared, 'viz/general_pos_lda_viz.html')
# Static summaries from the project's visualizations module (imported as `viz`).
viz.wordcloud_topics_viz(ldamodel_pos)
viz.viz_wordcount_importance_keywords(ldamodel_pos, reviews_pos)
# NOTE(review): the meaning of the extra positional 500 is not visible here --
# check the signature of viz.viz_most_discussed_topics.
viz.viz_most_discussed_topics(ldamodel_pos, corpus_pos, 500)
# Keep only the negative reviews, i.e. those rated 1 or 2 stars.
df_neg_reviews = df_prepared_reviews[df_prepared_reviews['stars'].isin([1, 2])]
%%time
# Same modelling inputs as for the full dataset, restricted to negative reviews.
reviews_neg, id2word_neg, corpus_neg = modelization.create_dict_corpus(df_neg_reviews)
# Build LDA model
ldamodel_neg = modelization.viz_best_coherence(id2word_neg, corpus=corpus_neg, texts=reviews_neg)
CPU times: user 4min 32s, sys: 33.8 s, total: 5min 5s Wall time: 6min 18s
# Interactive pyLDAvis view of the topics of the LDA model trained on negative reviews.
pyLDAvis.enable_notebook()
# sort_topics=False is passed so that pyLDAvis keeps the model's own topic order.
LDAvis_prepared = pyLDAvis.gensim.prepare(ldamodel_neg, corpus=corpus_neg, dictionary=id2word_neg, sort_topics=False)
# Save the visualization to html. Create the output folder first so the export
# does not fail when 'viz/' is missing, e.g. on a fresh checkout (os is
# imported at the top of the notebook).
os.makedirs('viz', exist_ok=True)
pyLDAvis.save_html(LDAvis_prepared, 'viz/general_neg_lda_viz.html')
# Static summaries from the project's visualizations module (imported as `viz`):
# topic word clouds, keyword counts/importance, and the most discussed topics.
viz.wordcloud_topics_viz(ldamodel_neg)
viz.viz_wordcount_importance_keywords(ldamodel_neg, reviews_neg)
viz.viz_most_discussed_topics(ldamodel_neg, corpus_neg)
# Reviews per month again, as context for choosing the time periods analysed next.
# NOTE(review): this passes the raw df_scraped_reviews, whereas the same plot earlier
# in the notebook used the cleaned df_prepared_reviews -- confirm the raw frame is intended.
viz.plot_occ_over_time(df_scraped_reviews, "Number of Reviews per month")
We can see from the plot above that there are many recent reviews, and fewer and fewer as we go back in time.
To analyse how topics evolve over time, we run an LDA on each time period we want to emphasize (independently for positive and negative reviews). Here we choose 3 time periods:
# Period boundaries: each consecutive pair delimits one period, so these four
# values yield the three periods 2012_2018, 2018_02-2020 and 02-2020_12-2020
# (as reported in the training log printed by the next call).
periods = ["2012", "2018", "02-2020", "12-2020"]
# Train one LDA per period, separately for the positive and the negative reviews.
dict_models_over_time = modelization.train_ldas(periods, df_pos_reviews, df_neg_reviews)
Training LDA for positive reviews of 2012_2018 period
Training LDA for negative reviews of 2012_2018 period
Training LDA for positive reviews of 2018_02-2020 period
Training LDA for negative reviews of 2018_02-2020 period
Training LDA for positive reviews of 02-2020_12-2020 period
Training LDA for negative reviews of 02-2020_12-2020 period
# Unpack the per-period results for positive (pos=True) and negative (pos=False) reviews.
# NOTE(review): corpus_pos/reviews_pos and corpus_neg/reviews_neg are rebound here and
# replace the whole-dataset objects created earlier in the notebook, so re-running the
# earlier visualisation cells after this one would pick up these objects instead.
# Presumably each name now holds one entry per period -- verify in modelization.
models_pos, corpus_pos, id2words_pos, reviews_pos = modelization.get_models_corpus_reviewslists(dict_models_over_time, pos=True)
models_neg, corpus_neg, id2words_neg, reviews_neg = modelization.get_models_corpus_reviewslists(dict_models_over_time, pos=False)
# Keyword word counts and importance for each period, positive reviews.
viz.viz_evolution_wordcounts_importance(periods, models_pos, reviews_pos)
Between 2012 and 2018
Between 2018 and 02-2020
Between 02-2020 and 12-2020
# Keyword word counts and importance for each period, negative reviews.
viz.viz_evolution_wordcounts_importance(periods, models_neg, reviews_neg)
Between 2012 and 2018
Between 2018 and 02-2020
Between 02-2020 and 12-2020
# Most discussed topics for each period, positive reviews.
viz.viz_evolution_discussed_topics(periods, models_pos, corpus_pos)
Between 2012 and 2018
Between 2018 and 02-2020
Between 02-2020 and 12-2020
# Most discussed topics for each period, negative reviews.
viz.viz_evolution_discussed_topics(periods, models_neg, corpus_neg)
Between 2012 and 2018
Between 2018 and 02-2020
Between 02-2020 and 12-2020
# Topic word clouds for each period, positive reviews.
viz.viz_evolution_wordclouds(periods, models_pos)
Between 2012 and 2018
Between 2018 and 02-2020
Between 02-2020 and 12-2020
# Topic word clouds for each period, negative reviews.
viz.viz_evolution_wordclouds(periods, models_neg)
Between 2012 and 2018
Between 2018 and 02-2020
Between 02-2020 and 12-2020
# Export the per-period LDAvis pages to HTML, first for the positive models,
# then for the negative ones.
# NOTE(review): the trailing positional False presumably selects the negative
# variant (e.g. for output naming) -- verify against viz.export_evolution_LDAvis_html.
viz.export_evolution_LDAvis_html(periods, models_pos, corpus_pos, id2words_pos)
viz.export_evolution_LDAvis_html(periods, models_neg, corpus_neg, id2words_neg, False)